Libraries
library(librarian)
librarian::shelf(dplyr, stringr, checkmate,
Seurat, SeuratDisk,
ggplot2, gridExtra, cowplot,
quiet = TRUE)
if(!exists("databringr", mode="function")) source("utils.R")
Set working directory:
if (Sys.getenv("RSTUDIO")==1) {
# setwd to where the editor is, if the IDE is RStudio
setwd( dirname(rstudioapi::getSourceEditorContext(id = NULL)$path) )
} else {
# setwd to where the editor is in a general way - maybe less failsafe than the previous
setwd(here::here())
# the following checks that the latter went well, but assuming
# that the user has not changed the name of the repo
d <- str_split(getwd(), fsep)[[1]][length(str_split(getwd(), fsep)[[1]])]
if (d != 'Puigetal2023_bioinformatics_scripts') { stop(
paste0("Could not set working directory automatically to where this",
" script resides.\nPlease do `setwd()` manually"))
}
}
To save images outside the repo (to reduce size):
figdir <- paste0(c(head(str_split(getwd(), fsep)[[1]],-1),
paste0(tail(str_split(getwd(), fsep)[[1]],1), '_figures')),
collapse = fsep)
dir.create(figdir, showWarnings = FALSE)
data1)These data come from Hung RJ, Hu Y, Kirchner R, et al. A cell atlas of the adult Drosophila midgut. Proc Natl Acad Sci U S A. 2020;117(3):1514-1523. doi:[10.1073/pnas.1916820117](https://doi.org/10.1073/pnas.1916820117).
The processed data are available at the GEO with accession number GSE120537. However, the data were generated using 10x and inDrop technologies, and the supplementary files at GEO do not contain the fully integrated assay and clusters. Integration could be performed using Hung et al. (2020) published pipeline, but this can be somewhat machine-dependent (see e.g. here). Therefore, we will use the fully integrated dataset generated by Hung et al., kindly provided by Dr Claire Yanhui Hu.
As the dataset has already been integrated, let’s select the raw data as the default assay. Let’s add one more metadata to this set of cells indicating that they come from ‘dataset1’. Then we remove the mitochondrial and ribosomal genes and separate the dataset by technology, obtaining the two samples.
# get the dataset
inpath <- file.path(getwd(), 'input')
if (!file.exists( file.path(inpath, 'integrated_Hung2020.rds') )) {
databringr(from = 'http://genomics.essex.ac.uk/flygut/integrated_Hung2020.rds',
to = file.path(inpath, 'integrated_Hung2020.rds'))
}
# load data
data1 <- readRDS(file.path("input", "integrated_Hung2020.rds"))
as.data.frame.table(table((data1)$technology)) %>%
dplyr::rename(celltype=Var1, ncells=Freq) %>%
knitr::kable()
| celltype | ncells |
|---|---|
| 10x | 2979 |
| inDrop | 7626 |
# check mitochondrial/ribosomal genes
DefaultAssay(data1) <- "RNA"
data1[["percent.mt"]] <- PercentageFeatureSet(data1, pattern = '^mt:+')
data1[["percent.ribo"]] <- PercentageFeatureSet(data1, pattern = '^Rp[SL]')
VlnPlot(data1, features = c("nFeature_RNA", "nCount_RNA", "percent.mt",
"percent.ribo"))
DimPlot(data1, reduction = "umap", group.by = "integrated_celltype")
# quick check at present cell types
as.data.frame.table(table((data1)$integrated_celltype)) %>%
dplyr::rename(celltype=Var1, ncells=Freq) %>%
knitr::kable()
| celltype | ncells |
|---|---|
| 0 | 2213 |
| 10 | 336 |
| 16 | 237 |
| 19 | 89 |
| 2 | 806 |
| 20 | 54 |
| 21 | 36 |
| 9 | 394 |
| aEC | 1828 |
| cardia | 225 |
| EE | 849 |
| iron/copper | 559 |
| ISC/EB | 799 |
| LFC | 549 |
| mEC | 449 |
| pEC | 1182 |
# add dataset label
data1@meta.data[["data"]] <- "data1"
data1@meta.data[["celltype"]] <- data1@meta.data[["integrated_celltype"]]
# remove ribosomal and mitochondrial genes
data1 <- data1[ ! grepl('^Rp[SL]', rownames(data1)), ]
data1 <- data1[ ! grepl('^mt:+', rownames(data1)), ]
# divide cells by scRNAseq technology
object_1 <- SplitObject(data1, split.by = "technology")
rm(data1)
tenx_1 <- object_1[["10x"]]
indrop <- object_1[["inDrop"]]
rm('object_1')
data2)These data come from Li H, Janssens J, De Waegeneer M, et al. Fly Cell Atlas: A single-nucleus transcriptomic atlas of the adult fruit fly. Science. 2022;375(6584):eabk2432. doi:[10.1126/science.abk2432](https://doi.org/10.1126/science.abk2432).
Links to integrated 10x and Smart-seq2 intestine data are available
from the Fly Cell Atlas
website. The data is in h5ad format, so we need to
convert it to a readable format for Seurat. We did the same steps as the
previous data set and separated the data by technology and sex,
obtaining individual samples.
if(file.exists( file.path(inpath, 'integrated_Li2022.rds') )) {
data2 <- readRDS( file.path(inpath, 'integrated_Li2022.rds') )
} else {
if (!file.exists( file.path(getwd(), 'input', 'data2.h5seurat') )) {
databringr(from = 'https://cloud.flycellatlas.org/index.php/s/8kHHzDSo48FLjTm/download/gut.h5ad',
to = file.path(inpath, 'data2.h5ad'))
Convert((file.path(inpath, "data2.h5ad")),
file.path(inpath, "data2.h5seurat"))
}
# load data, show cell types by tech and sex
data2 <- LoadH5Seurat(file.path("input", "data2.h5seurat"))
DefaultAssay(data2) <- "RNA"
# remove ribosomal/mitochondrial genes
data2 <- data2[ ! grepl('^Rp[SL]', rownames(data2)), ]
data2 <- data2[ ! grepl('^mt:+', rownames(data2)), ]
# add dataset label
data2@meta.data[["data"]] <- "data2"
saveRDS(data2, file.path(file.path(inpath, 'integrated_Li2022.rds')))
}
# visualisation: number of cells per tech/sex/type and UMAP
as.data.frame.table(table((data2)$technology)) %>%
dplyr::rename(celltype=Var1, ncells=Freq) %>%
knitr::kable()
| celltype | ncells |
|---|---|
| 10x | 11788 |
| ss2 | 594 |
as.data.frame.table(table((data2)$sex)) %>%
dplyr::rename(celltype=Var1, ncells=Freq) %>%
knitr::kable()
| celltype | ncells |
|---|---|
| female | 6640 |
| male | 5742 |
DimPlot(data2, reduction = "umap", group.by = "annotation",
label = TRUE, label.size = 3, repel = TRUE) + NoLegend()
as.data.frame.table(table((data2)$annotation)) %>%
dplyr::rename(celltype=Var1, ncells=Freq) %>%
knitr::kable()
| celltype | ncells |
|---|---|
| adult Malpighian tubule | 38 |
| adult differentiating enterocyte | 299 |
| adult fat body | 15 |
| adult midgut enterocyte | 282 |
| adult midgut-hindgut hybrid zone | 294 |
| adult pylorus | 191 |
| antimicrobial peptide-producing cell | 304 |
| cardia (1) | 1150 |
| cardia (2) | 318 |
| copper cell | 95 |
| crop | 4715 |
| enteroblast | 225 |
| enterocyte of anterior adult midgut epithelium | 779 |
| enterocyte of posterior adult midgut epithelium | 580 |
| enterocyte-like | 1027 |
| enteroendocrine cell | 104 |
| intestinal stem cell | 305 |
| male accessory gland | 56 |
| midgut large flat cell | 264 |
| muscle cell | 100 |
| unknown | 760 |
| visceral muscle of the crop | 205 |
| visceral muscle of the midgut | 276 |
Prepare for integration by splitting cells according to sex and technology.
# divide cells by scRNAseq technology and sex
object_2 <- SplitObject(data2, split.by = "technology")
rm(data2)
ss2 <- object_2[["ss2"]]
ss2_split <- SplitObject(ss2, split.by = "sex")
ss2_split[["male"]]@meta.data[["sample"]] <- "ss2_1"
ss2_split[["female"]]@meta.data[["sample"]] <- "ss2_2"
ss2 <- merge(ss2_split[["male"]], c(ss2_split[["female"]]),
add.cell.ids = c("male","female"))
tenx_2 <- object_2[["10x"]]
tenx_2_split <- SplitObject(tenx_2, split.by = "sex")
tenx_2_split[["male"]]@meta.data[["sample"]] <- "10x_3"
tenx_2_split[["female"]]@meta.data[["sample"]] <- "10x_4"
tenx_2 <- merge(tenx_2_split[["male"]], c(tenx_2_split[["female"]]),
add.cell.ids = c("male","female"))
rm('object_2', 'tenx_2_split', 'ss2_split')
We merged the two datasets. We scaled the data and did the dimensionality reduction before integration. We can see that the datasets have batch effects coming from technology and experiment. We must then proceed to data integration.
outpath <- file.path(getwd(), 'output')
if (!file.exists( file.path(outpath, 'non_integrated_merged.rds') )) {
alldata <- merge(indrop, c(ss2,tenx_1,tenx_2),
add.cell.ids = c("indrop","ss2","tenx_1","tenx_2"))
rm('indrop', 'ss2', 'tenx_1', 'tenx_2')
# variable features
alldata <- FindVariableFeatures(alldata, nfeatures = 2000)
top20 <- head(VariableFeatures(alldata), 20)
LabelPoints(plot = VariableFeaturePlot(alldata), points = top20, repel = TRUE)
# scaling and dimensionality reduction
alldata <- ScaleData(alldata, verbose = FALSE)
alldata <- RunPCA(alldata, npcs = 30, verbose = FALSE)
alldata <- RunUMAP(alldata, dims = 1:30, verbose = FALSE)
# cache
if (!file.exists( outpath )) dir.create( outpath )
saveRDS(alldata, file.path(outpath, 'non_integrated_merged.rds'))
} else {
alldata <- readRDS(file.path(outpath, 'non_integrated_merged.rds'))
}
# check batch effects by sample/tech
p1 <- DimPlot(alldata, reduction = "umap", group.by = "data")
p1
# segregation by technology/experiment
p2 <- DimPlot(alldata, reduction = "umap", group.by = "sample")
p2
We use the Seurat::IntegrateData function to integrate
based on sample (independent experiments by technology). The algorithm
identifies anchors, which are pairs of cross-datasets of cells that are
in a compatible biological state, correcting for batch effects.
if (file.exists( file.path(outpath, 'alldata-int-cache1.rds') )) {
alldata.int <- readRDS( file.path(outpath, 'alldata-int-cache1.rds') )
} else {
# find anchor candidates per sample
alldata.list <- SplitObject(alldata, split.by = "sample")
rm(alldata)
for (i in 1:length(alldata.list)) {
alldata.list[[i]] <- ScaleData(alldata.list[[i]], verbose = FALSE)
alldata.list[[i]] <- NormalizeData(alldata.list[[i]], verbose = FALSE)
alldata.list[[i]] <- FindVariableFeatures(alldata.list[[i]], selection.method = "vst",
nfeatures = 2000, verbose = FALSE)}
# identify anchors and integrate
alldata.anchors <- FindIntegrationAnchors(alldata.list, dims = 1:30, verbose = FALSE)
to_integrate <- Reduce(intersect, lapply(alldata.anchors@object.list, rownames))
# this step is quite intensive
alldata.int <- IntegrateData(alldata.anchors, dims = 1:30, new.assay.name = "CCA",
features.to.integrate = to_integrate,
verbose = FALSE)
# manual cache for knitting the RMD into html
saveRDS(alldata.int, file.path(outpath, 'alldata-int-cache1.rds'))
}
Now we can scale the integrated data, perform dimensionality reduction by PCA (retaining the 30 first principal components) and visualise with UMAP and tSNE. With this, we can confirm that cells were in compatible biological states. However, due to the presence of different cell types, we observed differences in the clustering patterns.
if (file.exists( file.path(outpath, 'alldata-int-cache2.rds') )) {
alldata.int <- readRDS( file.path(outpath, 'alldata-int-cache2.rds') )
} else {
# rescale & PCA
alldata.int <- ScaleData(alldata.int, verbose = FALSE)
alldata.int <- RunPCA(alldata.int, npcs = 50, verbose = FALSE)
# show
DimPlot(alldata.int, reduction = "pca", group.by = "sample")
DimHeatmap(alldata.int, dims = 1, cells = 500, balanced = TRUE)
DimHeatmap(alldata.int, dims = 1:15, cells = 500, balanced = TRUE)
ElbowPlot(alldata.int, ndims = 50)
# 30 PCs is enough
alldata.int <- RunUMAP(alldata.int, dims = 1:30, verbose = FALSE)
alldata.int <- RunTSNE(alldata.int, dims = 1:30)
saveRDS(alldata.int, file.path(outpath, 'alldata-int-cache2.rds'))
}
# cells from different datasets and experiments now mingle
p3 <- DimPlot(alldata.int, reduction = "umap", group.by = "data")
p3
p4 <- DimPlot(alldata.int, reduction = "umap", group.by = "sample")
p4
From here, we can use the dimensionally-reduced, integrated data to
group cells of the same cell type. We use the
Seurat::FindNeighbors function to calculate the KNN and SNN
graphs; FindClusters will detect clusters based on these
graphs using the “Louvain” algorithm (at resolution = 1).
alldata.int <- FindNeighbors(alldata.int, dims = 1:30)
# explore resolution parameter (more resolution gives more clusters)
for (res in c(0.1, 0.25, 0.5, 0.6, 1, 1.5, 2)) {
alldata.int <- FindClusters(alldata.int, resolution = res, algorithm = 1)
}
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 22987
## Number of edges: 941777
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9836
## Number of communities: 17
## Elapsed time: 3 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 22987
## Number of edges: 941777
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9727
## Number of communities: 26
## Elapsed time: 3 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 22987
## Number of edges: 941777
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9583
## Number of communities: 26
## Elapsed time: 3 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 22987
## Number of edges: 941777
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9530
## Number of communities: 27
## Elapsed time: 3 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 22987
## Number of edges: 941777
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9353
## Number of communities: 37
## Elapsed time: 3 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 22987
## Number of edges: 941777
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9183
## Number of communities: 43
## Elapsed time: 3 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 22987
## Number of edges: 941777
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9032
## Number of communities: 50
## Elapsed time: 3 seconds
plot_grid(ncol = 3,
DimPlot(alldata.int, reduction = "umap", group.by = "CCA_snn_res.0.1") +
NoAxes() + ggtitle("louvain_0.1"),
DimPlot(alldata.int, reduction = "umap", group.by = "CCA_snn_res.0.25") +
NoAxes() + ggtitle("louvain_0.25"),
DimPlot(alldata.int, reduction = "umap", group.by = "CCA_snn_res.0.5") +
NoAxes() + ggtitle("louvain_0.5"),
DimPlot(alldata.int, reduction = "umap", group.by = "CCA_snn_res.1") +
NoAxes() + ggtitle("louvain_1"),
DimPlot(alldata.int, reduction = "umap", group.by = "CCA_snn_res.1.5") +
NoAxes() + ggtitle("louvain_1.5"),
DimPlot(alldata.int, reduction = "umap", group.by = "CCA_snn_res.2") +
NoAxes() + ggtitle("louvain_2")
)
# choose resolution = 1
alldata.int <- SetIdent(alldata.int, value = "CCA_snn_res.1")
table(alldata.int@active.ident)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 1379 1374 1309 1181 1082 1077 1035 938 906 870 815 779 765 681 653 652
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
## 649 624 577 569 554 444 443 428 391 355 341 325 293 283 269 259
## 32 33 34 35 36
## 251 231 114 58 33
DimPlot(alldata.int, reduction = "tsne", group.by = "CCA_snn_res.1")
DefaultAssay(alldata.int) <- "RNA"
To help with the annotation of clusters, we start with a heatmap of
the markers described by Hung et al. (2020). The file
cell_type_markers.csv was also provided by Dr Y Hu.
# read markers
markers <- readr::read_csv(file.path("resources","cell_type_markers.csv"), col_types = 'cc')
knitr::kable(markers)
| cell_identity | genes |
|---|---|
| ISC | Dl |
| ISC | Smvt |
| ISC | CG10006 |
| pISC | sna |
| ISCR2R5 | polo |
| ISC/EB | esg |
| EB | E(spl)m3-HLH |
| EB | E(spl)malpha-BFM |
| EB | E(spl)mbeta-HLH |
| EB | klu |
| EE | pros |
| EE | 7B2 |
| EE_subtype | AstA |
| EE_subtype | AstC |
| EE_subtype | CCHa1 |
| EE_subtype | CCHa2 |
| EE_subtype | Tk |
| EE_subtype | NPF |
| EE_subtype | Orcokinin |
| EE_subtype | Dh31 |
| aEC | nub |
| aEC | Myo31DF |
| mEC | nub |
| mEC | Myo31DF |
| pEC | nub |
| pEC | Myo31DF |
| aEC | betaTry |
| pEC | lambdaTry |
| mEC | lab |
| mEC | Vha100-4 |
| cardia | Pgant4 |
| copper | PGRP-LA |
| copper | PGRP-LC |
| copper | Apoltp |
| VM | Vn |
| LFC | Ilp3 |
| LFC | PGRP-SC1a |
| LFC | PGRP-SC1b |
| LFC | PGRP-SD |
| iron | PGRP-SC2 |
| iron | ZIP1(Zip42C.1) |
# set integrated clustering as cell identity
Idents(alldata.int) <- alldata.int@meta.data$CCA_snn_res.1
# get genome-wide gene expression per cluster, and visualise for marker genes
cluster.averages <- AverageExpression(alldata.int, return.seurat=TRUE, add.ident="sample")
DoHeatmap(cluster.averages, features=head(markers, 20)$genes, size=2)
DoHeatmap(cluster.averages, features=tail(markers, 21)$genes, size=2)
clusters <- alldata.int@meta.data$CCA_snn_res.1
# assign cell type to cluster
celltype <- case_when(
clusters %in% c(0) ~ "enterocyte-like",
clusters %in% c(1,8,9,11,12,33) ~ "crop",
clusters %in% c(2) ~ "intestinal stem cell / enteroblast",
clusters %in% c(3,5,6,14,19,20,23,36) ~ "enterocyte of anterior midgut epithelium",
clusters %in% c(4,10,27) ~ "cardia",
clusters %in% c(7) ~ "midgut large flat cell",
clusters %in% c(13,16,21,31) ~ "enterocyte of posterior midgut epithelium",
clusters %in% c(15,22,29) ~ "midgut enterocyte",
clusters %in% c(17,28) ~ "enteroendocrine cell",
clusters %in% c(18) ~ "muscle cell",
clusters %in% c(24) ~ "differentiating enterocyte",
clusters %in% c(25) ~ "antimicrobial peptide-producing cell",
clusters %in% c(26) ~ "midgut-hindgut hybrid zone",
clusters %in% c(30) ~ "pylorus",
clusters %in% c(34) ~ "male accessory gland",
clusters %in% c(35) ~ "malpighian tubule",
clusters %in% c(32) ~ "unkown",
TRUE ~ as.character(clusters))
# set integrated cell types as new cell identity
alldata.int@meta.data$integrated_celltype <- celltype
Idents(alldata.int) <- alldata.int@meta.data$integrated_celltype
# get genome-wide gene expression per cluster, and visualise
integratedcelltype.averages <- AverageExpression(alldata.int, return.seurat=TRUE)
DoHeatmap(integratedcelltype.averages, features=markers$genes, size=2)
DimPlot(alldata.int, group.by="integrated_celltype", label=TRUE)
DimPlot(alldata.int, group.by="integrated_celltype", label=TRUE, reduction = "pca")
DimPlot(alldata.int, group.by="integrated_celltype", label=TRUE, reduction = "tsne" )
This is something that could not be possible in Hung et al., (2020), but maybe now, with more cells, can be done. So we took a closer look at the ISC/EB group.
# extract ISC/EB cluster from all cells
alldata.list <- SplitObject(alldata.int, split.by = "integrated_celltype")
isc_cluster <- alldata.list[["intestinal stem cell / enteroblast"]]
# re-cluster independently
DefaultAssay(isc_cluster) <- "CCA"
isc_cluster <- FindNeighbors(isc_cluster, dims = 1:30)
isc_cluster <- FindClusters(isc_cluster, resolution = 1, algorithm = 1)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 1309
## Number of edges: 61492
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.6613
## Number of communities: 7
## Elapsed time: 0 seconds
DefaultAssay(isc_cluster) <- "RNA"
Idents(isc_cluster) <- isc_cluster@meta.data$CCA_snn_res.1
table(Idents(isc_cluster) )
##
## 0 1 2 3 4 5 6
## 251 240 189 171 163 154 141
# evaluate marker expression in clusters
isc <- c("Dl", "Smvt", "sna", "polo", "stf", "cnn")
eb <- c("klu", "E(spl)m3-HLH", "E(spl)malpha-BFM", "E(spl)mbeta-HLH")
cluster.averages <- AverageExpression(isc_cluster, return.seurat=TRUE, add.ident="sample")
DoHeatmap(cluster.averages, features=isc, size=2)
DoHeatmap(cluster.averages, features=eb, size=2)
With this, we can now separate ISCs and EBs:
# discriminate ISCs/EBs based on expression of ISC/EB markers
clusters <- isc_cluster@meta.data$CCA_snn_res.1
celltype <- case_when(
clusters %in% c(1,5,6) ~ "enteroblast",
clusters %in% c(0,2,3,4) ~ "intestinal stem cell",
TRUE ~ as.character(clusters))
# apply and check marker expression
isc_cluster@meta.data$specific_celltype <- celltype
Idents(isc_cluster) <- isc_cluster@meta.data$specific_celltype
isc.eb_averages <- AverageExpression(isc_cluster, return.seurat=TRUE, add.ident="sample")
DoHeatmap(isc.eb_averages, features=isc, size=2)
DoHeatmap(isc.eb_averages, features=eb, size=2)
# check cell distribution in UMAP
DimPlot(isc_cluster, reduction = "umap", group.by = "specific_celltype")
# include this distinction in the full dataset
isc_celltype <- isc_cluster@meta.data[["specific_celltype"]]
names(isc_celltype) <- colnames(x = isc_cluster)
int_celltype <- alldata.int@meta.data[["integrated_celltype"]]
names(int_celltype) <- colnames(x = alldata.int)
data <- c(isc_celltype,int_celltype)
alldata.int <- AddMetaData(
object = alldata.int,
metadata = data,
col.name = 'specific_celltype'
)
Plots of the final integrated, annotated dataset:
DimPlot(alldata.int, reduction = "umap", group.by = "specific_celltype")
DimPlot(alldata.int, reduction = "tsne", group.by = "specific_celltype")
DefaultAssay(alldata.int) <- "RNA"
Save dataset.
as.data.frame.table(table((alldata.int)$specific_celltype)) %>%
dplyr::rename(celltype=Var1, ncells=Freq) %>%
knitr::kable()
| celltype | ncells |
|---|---|
| antimicrobial peptide-producing cell | 355 |
| cardia | 2222 |
| crop | 4925 |
| differentiating enterocyte | 391 |
| enteroblast | 535 |
| enterocyte of anterior midgut epithelium | 5530 |
| enterocyte of posterior midgut epithelium | 2033 |
| enterocyte-like | 1379 |
| enteroendocrine cell | 917 |
| intestinal stem cell | 774 |
| male accessory gland | 114 |
| malpighian tubule | 58 |
| midgut enterocyte | 1378 |
| midgut large flat cell | 938 |
| midgut-hindgut hybrid zone | 341 |
| muscle cell | 577 |
| pylorus | 269 |
| unkown | 251 |
saveRDS(alldata.int, file.path(outpath, "gut_int_annot.rds"))
We select the cell types that we are interested and, using the FindMarkers function, we find genes differentially expressed in each cell type. We will use these generated gene lists later for pseudotime analysis.
alldata.list <- SplitObject(alldata.int, split.by = "specific_celltype")
data <- alldat.int <- merge(
alldata.list[["enterocyte of anterior midgut epithelium"]],
c(alldata.list[["enterocyte of posterior midgut epithelium"]],
alldata.list[["intestinal stem cell"]],
alldata.list[["enteroblast"]],
alldata.list[["enteroendocrine cell"]]),
add.cell.ids = c("enterocyte of anterior adult midgut epithelium",
"enterocyte of posterior adult midgut epithelium",
"intestinal stem cell",
"enteroblast",
"enteroendocrine cell")
)
Idents(data) <- data@meta.data$specific_celltype
if (!file.exists( file.path(outpath, "markers-mast") )) {
dir.create(file.path(outpath, "markers-mast"))
}
if (!file.exists( file.path(outpath, "markers-mast", "enterocyte_anterior_mast.csv") )) {
markersmast <- FindMarkers(
data, ident.1="enterocyte of anterior midgut epithelium", test.use="MAST") %>%
as.data.frame() %>%
tibble::rownames_to_column("gene") %>%
arrange(p_val_adj)
readr::write_csv(markersmast,
file.path(outpath, "markers-mast",
"enterocyte_anterior_mast.csv"))
markersmast <- FindMarkers(
data, ident.1="enterocyte of posterior midgut epithelium", test.use="MAST") %>%
as.data.frame() %>%
tibble::rownames_to_column("gene") %>%
arrange(p_val_adj)
readr::write_csv(markersmast,
file.path(outpath, "markers-mast",
"enterocyte_posterior_mast.csv"))
markersmast <- FindMarkers(
data, ident.1="intestinal stem cell", test.use="MAST") %>%
as.data.frame() %>%
tibble::rownames_to_column("gene") %>%
arrange(p_val_adj)
readr::write_csv(markersmast,
file.path(outpath, "markers-mast",
"stem_cell_mast.csv"))
markersmast <- FindMarkers(
data, ident.1="enteroblast", test.use="MAST") %>%
as.data.frame() %>%
tibble::rownames_to_column("gene") %>%
arrange(p_val_adj)
readr::write_csv(markersmast,
file.path(outpath, "markers-mast",
"enteroblast_mast.csv"))
markersmast <- FindMarkers(
data, ident.1="enteroendocrine cell", test.use="MAST") %>%
as.data.frame() %>%
tibble::rownames_to_column("gene") %>%
arrange(p_val_adj)
readr::write_csv(markersmast,
file.path(outpath, "markers-mast",
"enteroendocrine_mast.csv"))
}
unlink( c(file.path(outpath, 'alldata-int-cache1.rds'),
file.path(outpath, 'alldata-int-cache2.rds')) )